library(data.table)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.1.6 ✔ dplyr 1.0.7
## ✔ tidyr 1.1.4 ✔ stringr 1.4.0
## ✔ readr 2.1.1 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(pracma)
##
## Attaching package: 'pracma'
## The following object is masked from 'package:purrr':
##
## cross
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(stats)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Load the spectra. Column 1 holds the class code, which is relabeled in
# place to the polysaccharide names. Code 1 is intentionally left numeric;
# it is dropped downstream via filter(Y != 1).
# NOTE(review): this assumes column 1 *is* Y — confirm against the CSV header.
# (After the first assignment the column is coerced to character, so the
# later comparisons Y == 2 etc. match via "2" == "2".)
df <- read.csv("wolfberry.csv", row.names = NULL)
df[df$Y == 0, 1] <- "LBPs"
df[df$Y == 2, 1] <- "Dextran"
df[df$Y == 3, 1] <- "Maltodextrin"
df[df$Y == 4, 1] <- "Starch"
There are two general methods to perform PCA in R: the function princomp() uses the spectral decomposition approach, while prcomp() and PCA() from the FactoMineR package use the singular value decomposition (SVD).
# Candidate wavenumber windows; spectral columns are named X<index>.
inclusion1 <- paste0("X", 1:779)
inclusion2 <- paste0("X", 1038:1870)
inclusion3 <- paste0("X", 1142:1870)
inclusion4 <- paste0("X", 1298:1870)
# Keep window 3 plus concentration (C) and label (Y); drop class code 1
# and restrict to the two lowest concentrations.
df_sub <- df[, c("C", "Y", inclusion3)]
df.active <- df_sub %>%
  filter(Y != 1, C == 0 | C == 0.05)
df.active_x <- df.active %>% select(-Y, -C)
dim(df.active_x)
## [1] 200 729
We apply the SVD-based PCA to the first-derivative spectra to reduce the per-sample (individual) variance.
# Visual sanity check on sample 1: first derivative vs. the raw spectrum.
first_spectrum <- as.numeric(df.active_x[1, ])
first_deriv <- pracma::gradient(first_spectrum, h1 = 1)
plot(first_deriv, type = "l", main = "Gradient", ylab = "", xlab = "")
plot(first_spectrum, type = "l", main = "Original", ylab = "", xlab = "")
# First-derivative transform of every spectrum (row-wise).
# NOTE(review): growing a data.frame with rbind() in a loop is O(n^2);
# kept as-is because downstream model formulas depend on the column
# names this construction produces. seq_len() replaces the unsafe 1:nrow.
derivative <- data.frame()
for (i in seq_len(nrow(df.active_x))) {
  value_prime <- pracma::gradient(as.numeric(df.active_x[i, ]), h1 = 1)
  # Second derivative is computed but currently unused — the commented
  # line below shows the experiment that would append it.
  value_prime_prime <- pracma::gradient(value_prime, h1 = 1)
  #value_prime <- c(value_prime, value_prime_prime)
  derivative <- rbind(derivative, value_prime)
}
# PCA (SVD-based) on the derivative spectra. The argument is spelled out
# as `scale.` — the original `scale = FALSE` only worked through partial
# argument matching.
res.pca <- prcomp(derivative, scale. = FALSE)
# Scree plot: variance explained per component.
fviz_eig(res.pca)
https://plotly.com/r/pca-visualization/
# Per-component explained variance, expressed as a percentage.
prin_comp <- res.pca
explained_variance_ratio <- 100 * summary(prin_comp)$importance["Proportion of Variance", ]
# Score matrix as a data frame, with the class label appended.
components <- as.data.frame(prin_comp$x)
components <- cbind(components, df.active$Y)
# Flip PC2/PC3 signs for display orientation only (PCA signs are arbitrary).
components$PC3 <- -components$PC3
components$PC2 <- -components$PC2
# Shared axis style reused by the scatterplot-matrix panels below.
# Fixed: top-level assignment now uses `<-` instead of `=`.
# NOTE(review): '#ffff' is a 4-digit hex color — presumably '#ffffff'
# (white) was intended; left unchanged to preserve the rendered output.
axis <- list(showline = FALSE,
             zeroline = FALSE,
             gridcolor = '#ffff',
             ticklen = 4,
             titlefont = list(size = 13))
# Scatterplot matrix (SPLOM) of the first four PCs, colored by class.
# Each panel label carries the % variance explained by that component.
fig <- components %>%
plot_ly() %>%
add_trace(
type = 'splom',
dimensions = list(
list(label=paste('PC 1 (',toString(round(explained_variance_ratio[1],1)),'%)',sep = ''), values=~PC1),
list(label=paste('PC 2 (',toString(round(explained_variance_ratio[2],1)),'%)',sep = ''), values=~PC2),
list(label=paste('PC 3 (',toString(round(explained_variance_ratio[3],1)),'%)',sep = ''), values=~PC3),
list(label=paste('PC 4 (',toString(round(explained_variance_ratio[4],1)),'%)',sep = ''), values=~PC4)
),
# `df.active$Y` is the literal column name produced by cbind() above.
color = ~factor(`df.active$Y`), colors = c('#636EFA','#EF553B','#00CC96', '#F2C43D')
) %>%
# Hide the self-vs-self diagonal panels.
style(diagonal = list(visible = FALSE)) %>%
layout(
legend=list(title=list(text='color')),
hovermode='closest',
dragmode= 'select',
plot_bgcolor='rgba(240,240,240, 0.95)',
# Shared `axis` style (defined above) applied to the non-first panels.
xaxis=list(domain=NULL, showline=F, zeroline=F, gridcolor='#ffff', ticklen=4),
yaxis=list(domain=NULL, showline=F, zeroline=F, gridcolor='#ffff', ticklen=4),
xaxis2=axis,
xaxis3=axis,
xaxis4=axis,
yaxis2=axis,
yaxis3=axis,
yaxis4=axis
)
fig
# 3-D scatter of the first three PCs, colored by class.
prin_comp <- res.pca
components <- prin_comp[["x"]]
components <- data.frame(components)
# Sign flips for display orientation (kept consistent with the SPLOM above).
components$PC3 <- -components$PC3
components$PC2 <- -components$PC2
components <- cbind(components, df.active$Y)
# Total variance captured by all PCs — currently only used by the
# commented-out title below.
tot_explained_variance_ratio <- summary(prin_comp)[["importance"]]['Proportion of Variance',]
tot_explained_variance_ratio <- 100 * sum(tot_explained_variance_ratio)
#tit = 'Total Explained Variance = 99.48'
fig <- plot_ly(components, x = ~PC1, y = ~PC2, z = ~PC3, color = ~factor(`df.active$Y`), colors = c('#636EFA','#EF553B','#00CC96', '#F2C43D')) %>%
add_markers(size = 30)
fig <- fig %>%
layout(
scene = list(bgcolor = "#e5ecf6")
)
fig
deriv_df.active <- cbind(derivative, Y = df.active$Y)
The class labels look slightly imbalanced:
library(rsample)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Deliberately switch to the pre-R-3.6.0 "Rounding" sampler so the split
# reproduces results generated under older R versions — this is exactly
# what the warning below reports.
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(417)
df.active_wo_c <- deriv_df.active
# 80/20 train/test split; training-fold class counts are shown below.
splitted <- initial_split(data = df.active_wo_c, prop = 0.8)
table(training(splitted)$Y) %>% kable() %>% kable_styling()
| Var1 | Freq |
|---|---|
| Dextran | 40 |
| LBPs | 44 |
| Maltodextrin | 38 |
| Starch | 38 |
prop.table(table(training(splitted)$Y)) %>% kable() %>% kable_styling()
| Var1 | Freq |
|---|---|
| Dextran | 0.2500 |
| LBPs | 0.2750 |
| Maltodextrin | 0.2375 |
| Starch | 0.2375 |
Split into training and testing sets: 160 training samples and 40 testing samples.
# Materialize the two folds and report their sizes.
train <- training(splitted)
test <- testing(splitted)
nrow(train)
nrow(test)
# define models to try
# Classifiers compared: multinomial logit, naive Bayes, linear SVM, PLS-DA.
models <- c("multinom", "naive_bayes", "svmLinear", "pls")
# set CV control for knn, k-folds
# 5-fold CV repeated 10 times on the class label; "oneSE" selects the
# simplest candidate within one standard error of the best, and PCA
# pre-processing is capped at 10 components.
myfolds <- createMultiFolds(train$Y, k = 5, times = 10)
control <- trainControl("repeatedcv",
index = myfolds,
selectionFunction = "oneSE",
preProcOptions = list(pcaComp = 10))
# fit models
set.seed(1)
train_models <- lapply(models, function(model){
#print(model)
# PLS performs its own dimension reduction, so it only gets scaling;
# all other models are fit on the PCA-reduced features.
# NOTE(review): caret applies pre-processing steps in its own internal
# order regardless of the order given in preProc — confirm the intended
# scale-then-PCA behavior.
if (model != "pls") {
train(as.factor(Y) ~ .,
method = model,
data = train,
trControl = control,
metric = "Accuracy",
preProc = c("pca", "scale"))
} else {
train(as.factor(Y) ~ .,
method = model,
data = train,
trControl = control,
metric = "Accuracy",
preProc = c("scale"))
}
})
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
## Warning in fitFunc(X, Y, ncomp, Y.add = Y.add, center = center, ...): No convergence in 100 iterations
names(train_models) <- models
# Elapsed training time per model.
elapsed <- sapply(train_models, function(object)
  object$times$everything["elapsed"])
# Predict on the held-out test fold once per model and cache the
# confusion matrices; every metric below is derived from this cache.
# (The original re-ran predict() and confusionMatrix() for each of the
# seven metrics.)
cms <- lapply(train_models, function(fit) {
  confusionMatrix(predict(fit, newdata = test),
                  reference = as.factor(test$Y))
})
# Overall accuracy per model.
acc <- sapply(cms, function(cm) cm[["overall"]]["Accuracy"])
# Macro-averaged (unweighted mean over classes) sensitivity.
macro_sens <- sapply(cms, function(cm) mean(cm[["byClass"]][, "Sensitivity"], na.rm = TRUE))
# Per-class sensitivity (classes x models).
sensitivity <- sapply(cms, function(cm) cm[["byClass"]][, "Sensitivity"])
# Macro-averaged precision.
macro_precision <- sapply(cms, function(cm) mean(cm[["byClass"]][, "Precision"], na.rm = TRUE))
# Per-class precision (classes x models).
precision <- sapply(cms, function(cm) cm[["byClass"]][, "Precision"])
# Per-class F1 (classes x models).
F1 <- sapply(cms, function(cm) cm[["byClass"]][, "F1"])
# Macro-averaged F1.
F1_M <- sapply(cms, function(cm) mean(cm[["byClass"]][, "F1"], na.rm = TRUE))
# Test-fold performance tables: accuracy, per-class and macro sensitivity,
# precision, and F1 for each classifier.
library(kableExtra)
acc %>% kable() %>% kable_styling()
| x | |
|---|---|
| multinom.Accuracy | 1.000 |
| naive_bayes.Accuracy | 0.875 |
| svmLinear.Accuracy | 1.000 |
| pls.Accuracy | 0.625 |
sensitivity %>% kable() %>% kable_styling()
| multinom | naive_bayes | svmLinear | pls | |
|---|---|---|---|---|
| Class: Dextran | 1 | 0.9000000 | 1 | 1.00 |
| Class: LBPs | 1 | 1.0000000 | 1 | 1.00 |
| Class: Maltodextrin | 1 | 0.9166667 | 1 | 0.25 |
| Class: Starch | 1 | 0.7500000 | 1 | 0.50 |
macro_sens %>% kable() %>% kable_styling()
| x | |
|---|---|
| multinom | 1.0000000 |
| naive_bayes | 0.8916667 |
| svmLinear | 1.0000000 |
| pls | 0.6875000 |
precision %>% kable() %>% kable_styling()
| multinom | naive_bayes | svmLinear | pls | |
|---|---|---|---|---|
| Class: Dextran | 1 | 1.0 | 1 | 0.9090909 |
| Class: LBPs | 1 | 0.6 | 1 | 0.5454545 |
| Class: Maltodextrin | 1 | 1.0 | 1 | 0.5000000 |
| Class: Starch | 1 | 0.9 | 1 | 0.5000000 |
macro_precision %>% kable() %>% kable_styling()
| x | |
|---|---|
| multinom | 1.0000000 |
| naive_bayes | 0.8750000 |
| svmLinear | 1.0000000 |
| pls | 0.6136364 |
F1 %>% kable() %>% kable_styling()
| multinom | naive_bayes | svmLinear | pls | |
|---|---|---|---|---|
| Class: Dextran | 1 | 0.9473684 | 1 | 0.9523810 |
| Class: LBPs | 1 | 0.7500000 | 1 | 0.7058824 |
| Class: Maltodextrin | 1 | 0.9565217 | 1 | 0.3333333 |
| Class: Starch | 1 | 0.8181818 | 1 | 0.5000000 |
F1_M %>% kable() %>% kable_styling()
| x | |
|---|---|
| multinom | 1.0000000 |
| naive_bayes | 0.8680180 |
| svmLinear | 1.0000000 |
| pls | 0.6228992 |
#predict(mod1, newdata = test, type = "prob")
(Optional) Visualize the derivative
# Plot a few training-set derivative spectra (all feature columns; the
# label Y is the last column and is excluded). The original index
# `1:ncol(train)-1` actually evaluated as 0:(ncol-1) — it selected the
# intended columns only because index 0 is silently dropped; seq_len
# makes the intent explicit.
plot(as.numeric(train[1, seq_len(ncol(train) - 1)]), type = "l", main = train[1, "Y"])
plot(as.numeric(train[24, seq_len(ncol(train) - 1)]), type = "l", main = train[24, "Y"])
plot(as.numeric(train[2, seq_len(ncol(train) - 1)]), type = "l", main = train[2, "Y"])
plot(as.numeric(train[157, seq_len(ncol(train) - 1)]), type = "l", main = train[157, "Y"])
# Rebuild the feature set with the remaining concentrations (C != 0.05)
# to use as an external validation set for the classifiers trained above.
df_sub <- df[, c("C", "Y", inclusion3)]
df.active <- df_sub %>% filter(Y != 1)
df.active <- df.active %>% filter(C != 0.05)
df.active_x <- df.active %>% select(-c(Y, C))
dim(df.active_x)
## [1] 800 729
# NOTE(review): at this point `derivative` is still the 200-row object
# from the earlier split (it is only recomputed in the loop below), while
# df.active$Y now has 800 entries. This cbind looks stale/dead —
# deriv_df.active is never used afterwards and the row counts do not
# match — confirm and remove it or move it after the loop.
deriv_df.active <- cbind(derivative, Y = df.active$Y)
# Recompute first derivatives for the 800 external samples (same
# rbind-growth pattern as the earlier loop).
derivative <- data.frame()
for (i in c(1:nrow(df.active_x))) {
value_prime <- pracma::gradient(as.numeric(df.active_x[i, ]), h1 = 1)
value_prime_prime <- pracma::gradient(value_prime, h1 = 1)
#value_prime <- c(value_prime, value_prime_prime)
derivative <- rbind(derivative, value_prime)
}
# External test set: derivative spectra plus labels; class counts below.
test <- cbind(derivative, Y = df.active$Y)
table(test$Y) %>% kable() %>% kable_styling()
| Var1 | Freq |
|---|---|
| Dextran | 250 |
| LBPs | 50 |
| Maltodextrin | 250 |
| Starch | 250 |
# Evaluate the trained classifiers on the external validation set
# (`test`, the held-out concentrations built above).
# Elapsed training time per model.
elapsed <- sapply(train_models, function(object)
  object$times$everything["elapsed"])
# Predict once per model and cache the confusion matrices; every metric
# below is derived from this cache (the original re-ran predict() and
# confusionMatrix() once per metric).
cms <- lapply(train_models, function(fit) {
  confusionMatrix(predict(fit, newdata = test),
                  reference = as.factor(test$Y))
})
# Overall accuracy per model.
acc <- sapply(cms, function(cm) cm[["overall"]]["Accuracy"])
# Macro-averaged sensitivity.
macro_sens <- sapply(cms, function(cm) mean(cm[["byClass"]][, "Sensitivity"], na.rm = TRUE))
# Per-class sensitivity (classes x models).
sensitivity <- sapply(cms, function(cm) cm[["byClass"]][, "Sensitivity"])
# Macro-averaged precision.
macro_precision <- sapply(cms, function(cm) mean(cm[["byClass"]][, "Precision"], na.rm = TRUE))
# Per-class precision (classes x models).
precision <- sapply(cms, function(cm) cm[["byClass"]][, "Precision"])
# Per-class F1 (classes x models).
F1 <- sapply(cms, function(cm) cm[["byClass"]][, "F1"])
# Macro-averaged F1.
F1_M <- sapply(cms, function(cm) mean(cm[["byClass"]][, "F1"], na.rm = TRUE))
# External-set performance tables: accuracy, per-class and macro
# sensitivity, precision, and F1 for each classifier.
acc %>% kable() %>% kable_styling()
| x | |
|---|---|
| multinom.Accuracy | 0.98125 |
| naive_bayes.Accuracy | 0.39250 |
| svmLinear.Accuracy | 0.95500 |
| pls.Accuracy | 0.53500 |
sensitivity %>% kable() %>% kable_styling()
| multinom | naive_bayes | svmLinear | pls | |
|---|---|---|---|---|
| Class: Dextran | 0.944 | 0.288 | 1.000 | 0.912 |
| Class: LBPs | 1.000 | 1.000 | 1.000 | 1.000 |
| Class: Maltodextrin | 1.000 | 0.304 | 1.000 | 0.008 |
| Class: Starch | 0.996 | 0.464 | 0.856 | 0.592 |
macro_sens %>% kable() %>% kable_styling()
| x | |
|---|---|
| multinom | 0.985 |
| naive_bayes | 0.514 |
| svmLinear | 0.964 |
| pls | 0.628 |
precision %>% kable() %>% kable_styling()
| multinom | naive_bayes | svmLinear | pls | |
|---|---|---|---|---|
| Class: Dextran | 0.9957806 | 0.2345277 | 0.8771930 | 0.7261146 |
| Class: LBPs | 1.0000000 | 0.4761905 | 0.9803922 | 0.9433962 |
| Class: Maltodextrin | 0.9469697 | 0.5277778 | 1.0000000 | 0.0952381 |
| Class: Starch | 1.0000000 | 0.4754098 | 1.0000000 | 0.3592233 |
macro_precision %>% kable() %>% kable_styling()
| x | |
|---|---|
| multinom | 0.9856876 |
| naive_bayes | 0.4284764 |
| svmLinear | 0.9643963 |
| pls | 0.5309931 |
F1 %>% kable() %>% kable_styling()
| multinom | naive_bayes | svmLinear | pls | |
|---|---|---|---|---|
| Class: Dextran | 0.9691992 | 0.2585278 | 0.9345794 | 0.8085106 |
| Class: LBPs | 1.0000000 | 0.6451613 | 0.9900990 | 0.9708738 |
| Class: Maltodextrin | 0.9727626 | 0.3857868 | 1.0000000 | 0.0147601 |
| Class: Starch | 0.9979960 | 0.4696356 | 0.9224138 | 0.4471299 |
F1_M %>% kable() %>% kable_styling()
| x | |
|---|---|
| multinom | 0.9849895 |
| naive_bayes | 0.4397779 |
| svmLinear | 0.9617731 |
| pls | 0.5603186 |
# Dextran-only regression task: predict concentration C from the spectra.
set.seed(417)
df.active_dex <- df %>%
  filter(Y == "Dextran") %>%
  select(-Y)
splitted <- initial_split(data = df.active_dex, prop = 0.8)
train <- training(splitted)
test <- testing(splitted)
# Concentration counts in the training fold.
table(train$C) %>% kable() %>% kable_styling()
| Var1 | Freq |
|---|---|
| 0.05 | 42 |
| 0.1 | 37 |
| 0.3 | 42 |
| 0.5 | 41 |
| 0.7 | 39 |
| 1 | 39 |
https://topepo.github.io/caret/model-training-and-tuning.html
# define models to try
# Regressors compared: linear model, LASSO, random forest (ranger), PLS.
models <- c("lm", "lasso", "ranger", "pls")
# set CV control for knn, k-folds
# 5-fold CV repeated 10 times on the concentration C; "oneSE" selects the
# simplest candidate within one SE of the best; PCA capped at 10 components.
myfolds <- createMultiFolds(train$C, k = 5, times = 10)
control <- trainControl("repeatedcv",
index = myfolds,
selectionFunction = "oneSE",
preProcOptions = list(pcaComp = 10))
# fit models
set.seed(1)
train_models <- lapply(models, function(model){
# PLS performs its own dimension reduction, so it only gets scaling;
# all other models are fit on the PCA-reduced features.
if (model != "pls") {
train(C ~ .,
method = model,
data = train,
trControl = control,
metric = "RMSE",
preProc = c("pca", "scale"))
} else {
train(C ~ .,
method = model,
data = train,
trControl = control,
metric = "RMSE",
preProc = c("scale"))
}
})
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
names(train_models) <- models
# Elapsed training time per model.
elapsed <- sapply(train_models, function(object)
  object$times$everything["elapsed"])
# Test-set regression performance per model. postResample() returns RMSE,
# Rsquared and MAE, so `rmse` is a 3-row matrix despite its name.
# (Original comment wrongly said "accuracy from CM"; the local was also
# misleadingly named `cm`.)
rmse <- sapply(train_models, function(x) {
  pred <- predict(x, newdata = test %>% select(-C))
  postResample(pred = pred, obs = test$C)
})
rmse
## lm lasso ranger pls
## RMSE 0.07738918 0.08193581 0.08707198 0.08829405
## Rsquared 0.94939923 0.94409055 0.95698336 0.93442858
## MAE 0.04865527 0.05239330 0.06825042 0.04717642
library(ggplot2)
# Predict once with every fitted model and reuse the results below
# (the original re-ran predict(train_models, ...) — which predicts with
# ALL models — once per plot).
all_preds <- predict(train_models, newdata = test)
# For each model: Welch t-test of predicted vs. true mean concentration,
# then a predicted-vs-true scatter with a y = x identity line. The
# identity line is sized to the data (the original hard-coded 60 points,
# which would break aes() for any other split size).
pred <- all_preds$lm
plot_table <- data.frame(pred = pred, test = test$C)
t.test(plot_table$pred, plot_table$test)
##
## Welch Two Sample t-test
##
## data: plot_table$pred and plot_table$test
## t = -0.052008, df = 117.93, p-value = 0.9586
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1270967 0.1205917
## sample estimates:
## mean of x mean of y
## 0.4517475 0.4550000
g1 <- plot_table %>%
ggplot() +
geom_line(aes(x = seq(0, 1, length.out = nrow(plot_table)), y = seq(0, 1, length.out = nrow(plot_table)))) +
geom_point(aes(x = pred, y = test), size = 0.5) + theme_bw() +
xlab("Predicted concentration") +
ylab("True concentration") +
ggtitle("Dextran - Linear regression")
pred <- all_preds$lasso
plot_table <- data.frame(pred = pred, test = test$C)
t.test(plot_table$pred, plot_table$test)
##
## Welch Two Sample t-test
##
## data: plot_table$pred and plot_table$test
## t = -0.10546, df = 117.67, p-value = 0.9162
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1286114 0.1156053
## sample estimates:
## mean of x mean of y
## 0.448497 0.455000
g2 <- plot_table %>%
ggplot() +
geom_line(aes(x = seq(0, 1, length.out = nrow(plot_table)), y = seq(0, 1, length.out = nrow(plot_table)))) +
geom_point(aes(x = pred, y = test), size = 0.5) + theme_bw() +
xlab("Predicted concentration") +
ylab("True concentration") +
ggtitle("Dextran - LASSO")
pred <- all_preds$ranger
plot_table <- data.frame(pred = pred, test = test$C)
t.test(plot_table$pred, plot_table$test)
##
## Welch Two Sample t-test
##
## data: plot_table$pred and plot_table$test
## t = -0.053243, df = 114.27, p-value = 0.9576
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1184748 0.1122729
## sample estimates:
## mean of x mean of y
## 0.451899 0.455000
g3 <- plot_table %>%
ggplot() +
geom_line(aes(x = seq(0, 1, length.out = nrow(plot_table)), y = seq(0, 1, length.out = nrow(plot_table)))) +
geom_point(aes(x = pred, y = test), size = 0.5) + theme_bw() +
xlab("Predicted concentration") +
ylab("True concentration") +
ggtitle("Dextran - Random forest")
pred <- all_preds$pls
plot_table <- data.frame(pred = pred, test = test$C)
t.test(plot_table$pred, plot_table$test)
##
## Welch Two Sample t-test
##
## data: plot_table$pred and plot_table$test
## t = -0.09793, df = 117.93, p-value = 0.9222
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1299850 0.1177347
## sample estimates:
## mean of x mean of y
## 0.4488748 0.4550000
g4 <- plot_table %>%
ggplot() +
geom_line(aes(x = seq(0, 1, length.out = nrow(plot_table)), y = seq(0, 1, length.out = nrow(plot_table)))) +
geom_point(aes(x = pred, y = test), size = 0.5) + theme_bw() +
xlab("Predicted concentration") +
ylab("True concentration") +
ggtitle("Dextran - Partial least square")
# 2x2 panel of the four model fits.
ggpubr::ggarrange(g1, g2, g3, g4)
# Maltodextrin-only regression task.
set.seed(417)
df.active_dex <- df %>%
  filter(Y == "Maltodextrin") %>%
  select(-Y)
splitted <- initial_split(data = df.active_dex, prop = 0.8)
train <- training(splitted)
test <- testing(splitted)
# define models to try
# Same regressor set as the Dextran task: lm, LASSO, ranger, PLS.
models <- c("lm", "lasso", "ranger", "pls")
# set CV control for k-folds
# 5-fold CV repeated 10 times; "oneSE" selection; PCA capped at 10 components.
myfolds <- createMultiFolds(train$C, k = 5, times = 10)
control <- trainControl("repeatedcv",
index = myfolds,
selectionFunction = "oneSE",
preProcOptions = list(pcaComp = 10))
# fit models
set.seed(1)
train_models <- lapply(models, function(model){
# Progress marker (visible in the knitted output below).
print(model)
# PLS performs its own dimension reduction, so it only gets scaling.
if (model != "pls") {
train(C ~ .,
method = model,
data = train,
trControl = control,
metric = "RMSE",
preProc = c("pca", "scale"))
} else {
train(C ~ .,
method = model,
data = train,
trControl = control,
metric = "RMSE",
preProc = c("scale"))
}
})
## [1] "lm"
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
## [1] "lasso"
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
## [1] "ranger"
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
## [1] "pls"
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
names(train_models) <- models
# Elapsed training time per model.
elapsed <- sapply(train_models, function(object)
  object$times$everything["elapsed"])
# Test-set regression performance per model. postResample() returns RMSE,
# Rsquared and MAE, so `rmse` is a 3-row matrix despite its name.
# (Original comment wrongly said "accuracy from CM"; the local was also
# misleadingly named `cm`.)
rmse <- sapply(train_models, function(x) {
  pred <- predict(x, newdata = test %>% select(-C))
  postResample(pred = pred, obs = test$C)
})
rmse
## lm lasso ranger pls
## RMSE 0.07403046 0.07881876 0.06172400 0.08271762
## Rsquared 0.95460759 0.95005914 0.97455226 0.94240234
## MAE 0.05911742 0.06410679 0.04666944 0.06379732
# Predict once with every fitted model and reuse the results below
# (the original re-ran predict(train_models, ...) — which predicts with
# ALL models — once per plot).
all_preds <- predict(train_models, newdata = test)
# For each model: Welch t-test of predicted vs. true mean concentration,
# then a predicted-vs-true scatter with a y = x identity line sized to
# the data (the original hard-coded 60 points).
pred <- all_preds$lm
plot_table <- data.frame(pred = pred, test = test$C)
t.test(plot_table$pred, plot_table$test)
##
## Welch Two Sample t-test
##
## data: plot_table$pred and plot_table$test
## t = -0.16643, df = 117.87, p-value = 0.8681
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1336747 0.1129472
## sample estimates:
## mean of x mean of y
## 0.4446363 0.4550000
g1 <- plot_table %>%
ggplot() +
geom_line(aes(x = seq(0, 1, length.out = nrow(plot_table)), y = seq(0, 1, length.out = nrow(plot_table)))) +
geom_point(aes(x = pred, y = test), size = 0.5) + theme_bw() +
xlab("Predicted concentration") +
ylab("True concentration") +
ggtitle("Maltodextrin - Linear regression")
pred <- all_preds$lasso
plot_table <- data.frame(pred = pred, test = test$C)
t.test(plot_table$pred, plot_table$test)
##
## Welch Two Sample t-test
##
## data: plot_table$pred and plot_table$test
## t = -0.14762, df = 117.39, p-value = 0.8829
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1300717 0.1120258
## sample estimates:
## mean of x mean of y
## 0.4459771 0.4550000
g2 <- plot_table %>%
ggplot() +
geom_line(aes(x = seq(0, 1, length.out = nrow(plot_table)), y = seq(0, 1, length.out = nrow(plot_table)))) +
geom_point(aes(x = pred, y = test), size = 0.5) + theme_bw() +
xlab("Predicted concentration") +
ylab("True concentration") +
ggtitle("Maltodextrin - LASSO")
pred <- all_preds$ranger
plot_table <- data.frame(pred = pred, test = test$C)
t.test(plot_table$pred, plot_table$test)
##
## Welch Two Sample t-test
##
## data: plot_table$pred and plot_table$test
## t = 0.050847, df = 116.85, p-value = 0.9595
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1164702 0.1226083
## sample estimates:
## mean of x mean of y
## 0.4580691 0.4550000
g3 <- plot_table %>%
ggplot() +
geom_line(aes(x = seq(0, 1, length.out = nrow(plot_table)), y = seq(0, 1, length.out = nrow(plot_table)))) +
geom_point(aes(x = pred, y = test), size = 0.5) + theme_bw() +
xlab("Predicted concentration") +
ylab("True concentration") +
ggtitle("Maltodextrin - Random forest")
pred <- all_preds$pls
plot_table <- data.frame(pred = pred, test = test$C)
t.test(plot_table$pred, plot_table$test)
##
## Welch Two Sample t-test
##
## data: plot_table$pred and plot_table$test
## t = -0.074878, df = 117.8, p-value = 0.9404
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1274305 0.1181450
## sample estimates:
## mean of x mean of y
## 0.4503573 0.4550000
g4 <- plot_table %>%
ggplot() +
geom_line(aes(x = seq(0, 1, length.out = nrow(plot_table)), y = seq(0, 1, length.out = nrow(plot_table)))) +
geom_point(aes(x = pred, y = test), size = 0.5) + theme_bw() +
xlab("Predicted concentration") +
ylab("True concentration") +
ggtitle("Maltodextrin - Partial least square")
# 2x2 panel of the four model fits.
ggpubr::ggarrange(g1, g2, g3, g4)
# Starch-only regression task.
set.seed(417)
df.active_dex <- df %>%
  filter(Y == "Starch") %>%
  select(-Y)
splitted <- initial_split(data = df.active_dex, prop = 0.8)
train <- training(splitted)
test <- testing(splitted)
# define models to try
# Same regressor set as the previous tasks: lm, LASSO, ranger, PLS.
models <- c("lm", "lasso", "ranger", "pls")
# set CV control for knn, k-folds
# 5-fold CV repeated 10 times; "oneSE" selection; PCA capped at 10 components.
myfolds <- createMultiFolds(train$C, k = 5, times = 10)
control <- trainControl("repeatedcv",
index = myfolds,
selectionFunction = "oneSE",
preProcOptions = list(pcaComp = 10))
# fit models
set.seed(1)
train_models <- lapply(models, function(model){
# Progress marker (visible in the knitted output below).
print(model)
# PLS performs its own dimension reduction, so it only gets scaling.
if (model != "pls") {
train(C ~ .,
method = model,
data = train,
trControl = control,
metric = "RMSE",
preProc = c("pca", "scale"))
} else {
train(C ~ .,
method = model,
data = train,
trControl = control,
metric = "RMSE",
preProc = c("scale"))
}
})
## [1] "lm"
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
## [1] "lasso"
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
## [1] "ranger"
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
##
## [1] "pls"
## Warning in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
## non-uniform 'Rounding' sampler used
names(train_models) <- models
# Elapsed training time per model.
elapsed <- sapply(train_models, function(object)
  object$times$everything["elapsed"])
# Test-set regression performance per model. postResample() returns RMSE,
# Rsquared and MAE, so `rmse` is a 3-row matrix despite its name.
# (Original comment wrongly said "accuracy from CM"; the local was also
# misleadingly named `cm`.)
rmse <- sapply(train_models, function(x) {
  pred <- predict(x, newdata = test %>% select(-C))
  postResample(pred = pred, obs = test$C)
})
rmse
## lm lasso ranger pls
## RMSE 0.07880187 0.07584639 0.09978973 0.1605508
## Rsquared 0.95029595 0.95318785 0.93793593 0.8095465
## MAE 0.06049102 0.05653805 0.06107886 0.1348417
# Predict once with every fitted model and reuse the results below
# (the original re-ran predict(train_models, ...) — which predicts with
# ALL models — once per plot).
all_preds <- predict(train_models, newdata = test)
# For each model: Welch t-test of predicted vs. true mean concentration,
# then a predicted-vs-true scatter with a y = x identity line sized to
# the data (the original hard-coded 60 points).
pred <- all_preds$lm
plot_table <- data.frame(pred = pred, test = test$C)
t.test(plot_table$pred, plot_table$test)
##
## Welch Two Sample t-test
##
## data: plot_table$pred and plot_table$test
## t = -0.21788, df = 117.99, p-value = 0.8279
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1397999 0.1120859
## sample estimates:
## mean of x mean of y
## 0.441143 0.455000
g1 <- plot_table %>%
ggplot() +
geom_line(aes(x = seq(0, 1, length.out = nrow(plot_table)), y = seq(0, 1, length.out = nrow(plot_table)))) +
geom_point(aes(x = pred, y = test), size = 0.5) + theme_bw() +
xlab("Predicted concentration") +
ylab("True concentration") +
ggtitle("Starch - Linear regression")
pred <- all_preds$lasso
plot_table <- data.frame(pred = pred, test = test$C)
t.test(plot_table$pred, plot_table$test)
##
## Welch Two Sample t-test
##
## data: plot_table$pred and plot_table$test
## t = -0.20063, df = 117.72, p-value = 0.8413
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1347624 0.1099678
## sample estimates:
## mean of x mean of y
## 0.4426027 0.4550000
g2 <- plot_table %>%
ggplot() +
geom_line(aes(x = seq(0, 1, length.out = nrow(plot_table)), y = seq(0, 1, length.out = nrow(plot_table)))) +
geom_point(aes(x = pred, y = test), size = 0.5) + theme_bw() +
xlab("Predicted concentration") +
ylab("True concentration") +
ggtitle("Starch - LASSO")
pred <- all_preds$ranger
plot_table <- data.frame(pred = pred, test = test$C)
t.test(plot_table$pred, plot_table$test)
##
## Welch Two Sample t-test
##
## data: plot_table$pred and plot_table$test
## t = -0.45311, df = 114.62, p-value = 0.6513
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.14230377 0.08932138
## sample estimates:
## mean of x mean of y
## 0.4285088 0.4550000
g3 <- plot_table %>%
ggplot() +
geom_line(aes(x = seq(0, 1, length.out = nrow(plot_table)), y = seq(0, 1, length.out = nrow(plot_table)))) +
geom_point(aes(x = pred, y = test), size = 0.5) + theme_bw() +
xlab("Predicted concentration") +
ylab("True concentration") +
ggtitle("Starch - Random forest")
pred <- all_preds$pls
plot_table <- data.frame(pred = pred, test = test$C)
t.test(plot_table$pred, plot_table$test)
##
## Welch Two Sample t-test
##
## data: plot_table$pred and plot_table$test
## t = 0.059375, df = 108.23, p-value = 0.9528
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1067038 0.1132939
## sample estimates:
## mean of x mean of y
## 0.458295 0.455000
g4 <- plot_table %>%
ggplot() +
geom_line(aes(x = seq(0, 1, length.out = nrow(plot_table)), y = seq(0, 1, length.out = nrow(plot_table)))) +
geom_point(aes(x = pred, y = test), size = 0.5) + theme_bw() +
xlab("Predicted concentration") +
ylab("True concentration") +
ggtitle("Starch - Partial least square")
# 2x2 panel of the four model fits.
ggpubr::ggarrange(g1, g2, g3, g4)